Reproducing “The Lasting Legacy of Redlining”

This notebook attempts to reproduce the figures in the FiveThirtyEight article “The Lasting Legacy of Redlining” originally authored by Ryan Best and Elena Mejía.

Step 1: Download Data

We obtain the project data from the FiveThirtyEight GitHub.

Step 2: Obtain Census Data

Use the tidycensus package to load shape files and 2020 census data.

This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau.

# Cache shapefiles
options(tigris_use_cache = TRUE)

# Get race data with shape files
# TODO: Allow regeneration of these files using the driver script.
# TODO: Allow users to invoke census API

blocks_file <- "../data/blocks.rds"

if(!file.exists(blocks_file)){
  blocks <- get_decennial(geography = "block", 
                          state="Pennsylvania",
                          variables = c("P2_001N", # Total
                                        "P2_002N", # Total Hispanic or Latino
                                        "P2_005N", # Total Non-Hispanic White
                                        "P2_006N", # Total Non-Hispanic Black
                                        "P2_007N", # Total Non-Hispanic Indian
                                        "P2_008N", # Total Non-Hispanic Asian
                                        "P2_009N", # Total Non-Hispanic Islander
                                        "P2_010N", # Total Non-Hispanic Other
                                        "P2_011N"), # Total Non-Hispanic two or more races
                          year = 2020, geometry = TRUE) %>% 
    pivot_wider(names_from = variable, values_from = value)

  # Manually cache the data to avoid downloading again
  saveRDS(blocks, file = "../data/blocks.rds")
} else {
  blocks <- read_rds(blocks_file)
}


states <- get_decennial(geography = "state", 
                         variables = c("STATE"), 
                         year = 2020, geometry = TRUE)
## Getting data from the 2020 decennial Census
## Using the PL 94-171 Redistricting Data summary file
## Note: 2020 decennial Census data use differential privacy, a technique that
## introduces errors into data to preserve respondent confidentiality.
## i Small counts should be interpreted with caution.
## i See https://www.census.gov/library/fact-sheets/2021/protecting-the-confidentiality-of-the-2020-census-redistricting-data.html for additional guidance.
## This message is displayed once per session.
# Get the total geography of each metro area
metros <- get_decennial(geography = "cbsa", 
                         variables = c("P2_001N"), 
                         year = 2020, geometry = TRUE)
## Getting data from the 2020 decennial Census
## Using the PL 94-171 Redistricting Data summary file
metros$NAME <- substr(metros$NAME, 1, nchar(metros$NAME)-11) # The 538 data has "Metro Area" and "Micro Area" removed from NAME
metro_grades <- left_join(metro_grades, metros, by=c("metro_area"="NAME")) # associate each metro area with a geography
metro_grades$centroid <- st_centroid(metro_grades$geometry)



# Filter for the blocks mapped by HOLC
holc_blocks <- inner_join(zone_matches, blocks, by=c("block_geoid20"="GEOID"))

# Select and reformat Pittsburgh
pitt <- holc_blocks %>% filter(holc_city == "Pittsburgh")

Step 3: Clean data to restrict to 10 percent buffer boundary

Step 4: Construct the map of the surrounding area

Bar Plots

surrounding_counts <- blocks_buffer %>% 
  summarize(white = sum(white),
            black = sum(black),
            hispanic = sum(hispanic),
            asian = sum(asian),
            other = sum(other_race))
surrounding_pct <- as.data.frame(t(round(surrounding_counts / sum(surrounding_counts), digits = 3)*100))
colnames(surrounding_pct) <- "Percentage"
surrounding_pct$Race <- c("White", "Black", "Latino", "Asian", "Other")
surrounding_pct$Race <- factor(surrounding_pct$Race, levels=rev(c("White", "Black", "Latino", "Asian", "Other")), ordered = T)
surrounding_pct$tmp <- factor("1", level=c("0","1"))
surrounding_pct$tmp_lab <- factor("1", level=c("0","1"))
surrounding_pct$label_loc <- cumsum(surrounding_pct$Percentage) - surrounding_pct$Percentage / 2
surrounding_pct$label <- paste(surrounding_pct$Race, " - ", surrounding_pct$Percentage, "%", sep="")

p <- ggplot() + geom_bar(data=surrounding_pct, aes(x = Percentage, y = tmp, fill = Race), color="white", lwd=1, width=0.2, stat="identity", show.legend=F) +
  geom_text(data=filter(surrounding_pct, Race %in% c("White", "Black")), aes(x = label_loc, y = tmp_lab, label = label), position = position_nudge(x = 0, y = -0.15), size = 4) +
  theme_void() +
  scale_fill_manual(values = rev(points$color))

surrounding_bar <- ggplotly(p, tooltip = c("Percentage", "Race"), width = 800, height = 200) %>%
  layout(
    showlegend = F, 
    legend = list(orientation = 'h'),
    yaxis = list(
            color = '#ffffff',
            gridcolor = '#ffff'),
    xaxis = list(
            color = '#ffffff',
            gridcolor = '#ffff')
  )
bar_grade <- function(metro_grades, grade){
  holc_pct <- as.data.frame(t(metro_grades %>% 
    filter(metro_area == "Pittsburgh, PA", holc_grade == grade) %>% 
    select(pct_white, pct_black, pct_hisp, pct_asian, pct_other)))
  colnames(holc_pct) <- "Percentage"
  holc_pct$Percentage <- round(holc_pct$Percentage, digits = 1)
  holc_pct$Race <- c("White", "Black", "Latino", "Asian", "Other")
  holc_pct$Race <- factor(holc_pct$Race, levels=rev(c("White", "Black", "Latino", "Asian", "Other")))
  holc_pct$Race2 <- holc_pct$Race # Need this so "Race" isn't duplicated due to grouping on ggplot
  holc_pct$tmp <- factor("1", level=c("0","1"))
  holc_pct$tmp_lab <- factor("1", level=c("0","1"))
  holc_pct$label_loc <- cumsum(holc_pct$Percentage) - holc_pct$Percentage / 2
  holc_pct$label <- paste(holc_pct$Race, " - ", holc_pct$Percentage, "%", sep="")

  p <- ggplot() + geom_bar(data=holc_pct, aes(x = Percentage, y = tmp, fill = Race, group = Race2), color="white", lwd=1, width=0.2, position="stack", stat="identity", show.legend=F) +
  geom_text(data=filter(holc_pct, Percentage > 6), aes(x = label_loc, y = tmp_lab, label = label), position = position_nudge(x = 0, y = -0.2), size = 4) +
  theme_void() +
  scale_fill_manual(values = rev(points$color))
  
  holc_bar <- ggplotly(p, tooltip = c("Percentage", "Race"), autosize = F, width = 800, height = 200) %>%
  layout(
    showlegend = F, 
    legend = list(orientation = 'h'),
    yaxis = list(
            color = '#ffffff',
            gridcolor = '#ffff'),
    xaxis = list(
            color = '#ffffff',
            gridcolor = '#ffff')
  )
  
  holc_bar
}

Add Pennsylvania to the plot

# The grid package allows us to print grobs to the graphics device by manipulating the viewport
# The ggplotify package allows us to turn arbitrary ggplot objects into grobs
# Both of these we use to overlay plots created with geom_sf previously
construct_surrounding_plot <- function(surrounding_area, PA_plot){
  grid.newpage()
  grid.draw(as.grob(surrounding_area))
  # Place Pennsylvania in the correct frame of reference relative to the surrounding area
  sample_vp <- viewport(x = 0.6, y = 0.05, 
                        width = 0.25, height = 0.25,
                        just = c("left", "bottom"),
                        angle = 15)
  pushViewport(sample_vp)
  grid.draw(as.grob(PA_plot))
  popViewport()
}

Step 5: Construct maps of each zone type

# Sample from every map grade
map_grade <- function(df, grade){
  samp <- df %>% filter(holc_grade == grade)
  print(sprintf("Saving Hispanic %s...", grade))
  hispanic_path <- sprintf("../data/sampled_hispanic_%s.rds", grade) 
  if(!file.exists(hispanic_path)){
    samp_hispanic <- samp %>% filter(hispanic > 0)
    output <- st_sample(samp_hispanic$geometry2, samp_hispanic$hispanic, type="random", exact=F)
    saveRDS(output, file = hispanic_path)
  }
  points_hispanic <- read_rds(hispanic_path)
  
  print(sprintf("Saving Other %s...", grade))
  other_path <- sprintf("../data/sampled_other_%s.rds", grade)
  if(!file.exists(other_path)){
    samp_other <- samp %>% filter(other_race > 0)
    output <- st_sample(samp_other$geometry2, samp_other$other_race, type="random", exact=F)
    saveRDS(output, file = other_path)
  }
  points_other <- read_rds(other_path)
  
  print(sprintf("Saving Asian %s...", grade))
  asian_path <- sprintf("../data/sampled_asian_%s.rds", grade)
  if(!file.exists(asian_path)){
    samp_asian <- samp %>% filter(asian > 0)
    output <- st_sample(samp_asian$geometry2, samp_asian$asian, type="random", exact=F)
    saveRDS(output, file = asian_path)
  }
  points_asian <- read_rds(asian_path)
  
  print(sprintf("Saving Black %s...", grade))
  black_path <- sprintf("../data/sampled_black_%s.rds", grade)
  if(!file.exists(black_path)){
    samp_black <- samp %>% filter(black > 0)
    output <- st_sample(samp_black$geometry2, samp_black$black, type="random", exact=F)
    saveRDS(output, file = black_path)
  }
  points_black <- read_rds(black_path)
  
  print(sprintf("Saving White %s...", grade))
  white_path <- sprintf("../data/sampled_white_%s.rds", grade)
  if(!file.exists(white_path)){
    samp_white <- samp %>% filter(white > 0)
    output <- st_sample(samp_white$geometry2, samp_white$white, type="random", exact=F)
  saveRDS(output, file = white_path)
  }
  points_white <- read_rds(white_path)
}

holc_samp <- inner_join(zone_matches, samp, by=c("block_geoid20"="GEOID"))

for(grade in c("A","B","C","D")){
  if(!file.exists(sprintf("../data/sampled_white_%s.rds", grade)) ||
  !file.exists(sprintf("../data/sampled_black_%s.rds", grade)) ||
  !file.exists(sprintf("../data/sampled_hispanic_%s.rds", grade)) ||
  !file.exists(sprintf("../data/sampled_asian_%s.rds", grade)) ||
  !file.exists(sprintf("../data/sampled_other_%s.rds", grade))){
    map_grade(holc_samp, grade)
  }
}

# Construct HOLC plots
holc_plots <- list()

grades <- c(A = "Best", B = "Desirable", C = "Declining", D = "Hazardous")

for(i in c("A","B","C","D")){
  points[[i]] <- c(st_combine(read_rds(sprintf("../data/sampled_white_%s.rds", i))),
       st_combine(read_rds(sprintf("../data/sampled_black_%s.rds", i))),
       st_combine(read_rds(sprintf("../data/sampled_hispanic_%s.rds", i))),
       st_combine(read_rds(sprintf("../data/sampled_asian_%s.rds", i))),
       st_combine(read_rds(sprintf("../data/sampled_other_%s.rds", i)))
       )
  
  holc_plots[[i]] <- ggmap(m) +
  geom_sf(data = cutout, mapping = aes(geometry=geometry), color="white", fill="white", inherit.aes = FALSE) +
  geom_sf(data = bound, mapping = aes(geometry=geometry), color="black", fill=NA, lwd = 0.5, inherit.aes = FALSE) +
  geom_sf(data = points, mapping = aes(geometry=.data[[i]], color=race), size=0.01, inherit.aes = FALSE, show.legend=F) + 
  labs(title = sprintf("Pittsburgh's %s Zones", grades[[i]])) + 
  scale_color_manual(values = points$color) +
  theme_void() + theme(plot.title=element_text(margin = margin(b=10), hjust = 0.45))
}
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Step 6: Produce Plots!

# Plot cleveland bar plot
cleveland <- metro_grades %>% filter(metro_area == "Cleveland-Elyria, OH") %>% select(holc_grade, pct_white, pct_black, pct_hisp, pct_asian, pct_other) %>% pivot_longer(pct_white:pct_other)
cleveland$holc_grade <- factor(cleveland$holc_grade, levels=rev(c("A","B","C","D")))
cleveland$holc_grade <- recode(cleveland$holc_grade, A="\"Best\"", B="\"Desirable\"", C="\"Declining\"", D="\"Hazardous\"")
cleveland$name <- recode(cleveland$name, pct_white="White", pct_black="Black", pct_hisp="Latino", pct_asian="Asian", pct_other = "Other")
cleveland$name <- factor(cleveland$name, levels=rev(c("White", "Black", "Latino", "Asian", "Other")))

ggplot(cleveland) + geom_bar(aes(x = value, y = holc_grade, fill = name), color="white", lwd=1, width=0.8, stat="identity", show.legend=T) +
  theme_void() +
  theme(axis.text.y = element_text(),
        legend.position="top",
        legend.title=element_blank()) +
  scale_fill_manual(guide=guide_legend(reverse=T), values = rev(points$color))

### DIFFERENT STYLE FROM ARTICLE ###

metros_A <- metro_grades %>% filter(holc_grade == grade)
  
  # Turn the points into longitude and latitude so we can repel the points on the ggplot
  metros_A$lon <- map_dbl(metros_A$centroid, function(x){x[[1]]})
  metros_A$lat <- map_dbl(metros_A$centroid, function(x){x[[2]]})
  metros_A$pct_nonwhite <- 100 - metros_A$pct_white
  metros_A$rad <- (metros_A$value^(1/3)) / 5000

nation_pie <- function(metro_grades, grade){
  metros_A <- metro_grades %>% filter(holc_grade == grade)
  
  # Turn the points into longitude and latitude so we can repel the points on the ggplot
  metros_A$lon <- map_dbl(metros_A$centroid, function(x){x[[1]]})
  metros_A$lat <- map_dbl(metros_A$centroid, function(x){x[[2]]})
  metros_A$pct_nonwhite <- 100 - metros_A$pct_white
  metros_A$rad <- (metros_A$value^(1/3)) / 200
  
  mas <- metros_A %>% select(lon, lat, pct_white, pct_nonwhite, value, rad) %>% arrange(desc(value))
  
  end <- nrow(mas)
  for(i in 1:(nrow(mas)-1)){
    cur <- mas[i,]
    lon_dist <- cur$lon - mas$lon[(i+1):end]
    lat_dist <- cur$lat - mas$lat[(i+1):end]
    dists <- sqrt((lon_dist)^2 + (lat_dist)^2)
    
    move <-  mas$rad[i] / dists
    move[move <= 1] <- 0
    
    mas$lon[(i+1):end] <- mas$lon[(i+1):end] - lon_dist*move
    mas$lat[(i+1):end] <- mas$lat[(i+1):end] - lat_dist*move
  }
  
  title <- ifelse(grade == "A", "Best", "Hazardous")
  
  ggplot() + 
    ggtitle(title) + 
    geom_sf(data=filter(states, !(NAME %in% c("Alaska", "Hawaii", "Puerto Rico"))) , mapping = aes(geometry = geometry)) +
    #geom_jitter(data=metros_A, mapping = aes(x = lon, y = lat)) + 
    geom_scatterpie(data=mas, 
                    mapping = aes(x = lon, y = lat, r = rad), 
                    cols=c("pct_white", "pct_nonwhite"), 
                    color = "white", show.legend=F) + 
    scale_fill_manual(values = c("#ffb262","#808285")) +
    theme_void() + 
    theme(plot.title = element_text(hjust = 0.5))
}

A <- nation_pie(metro_grades, "A")
D <- nation_pie(metro_grades, "D")
grid.arrange(A, D, nrow=1)

Pittsburgh, PA:

“BEST”

“DESIRABLE”

“DECLINING”

“HAZARDOUS”